library(readr)
library(dplyr)
package ‘dplyr’ was built under R version 3.4.4
Attaching package: ‘dplyr’
The following objects are masked from ‘package:stats’:
filter, lag
The following objects are masked from ‘package:base’:
intersect, setdiff, setequal, union
library(tidyr)
package ‘tidyr’ was built under R version 3.4.4
library(ggplot2)
package ‘ggplot2’ was built under R version 3.4.4RStudio Community is a great place to get help: https://community.rstudio.com/c/tidyverse.
library(glmnet)
package ‘glmnet’ was built under R version 3.4.4Loading required package: Matrix
package ‘Matrix’ was built under R version 3.4.4
Attaching package: ‘Matrix’
The following object is masked from ‘package:tidyr’:
expand
Loading required package: foreach
package ‘foreach’ was built under R version 3.4.3Loaded glmnet 2.0-16
library(gbm)
package ‘gbm’ was built under R version 3.4.4Loaded gbm 2.1.4
library(gam)
package ‘gam’ was built under R version 3.4.4Loading required package: splines
Loaded gam 1.16
library(stringr)
package ‘stringr’ was built under R version 3.4.4
library(xgboost)
package ‘xgboost’ was built under R version 3.4.4
Attaching package: ‘xgboost’
The following object is masked from ‘package:dplyr’:
slice
library(caret)
package ‘caret’ was built under R version 3.4.4Loading required package: lattice
unknown timezone 'zone/tz/2018g.1.0/zoneinfo/America/New_York'
library(Matrix)
library(e1071)
package ‘e1071’ was built under R version 3.4.4
library(liquidSVM)
Attaching package: ‘liquidSVM’
The following object is masked from ‘package:e1071’:
svm
# read data
read_csv("Data/wine-reviews/winemag-data-130k-v2.csv") %>% select(-X1) %>% unique -> data
Missing column names filled in: 'X1' [1]Parsed with column specification:
cols(
X1 = col_integer(),
country = col_character(),
description = col_character(),
designation = col_character(),
points = col_integer(),
price = col_double(),
province = col_character(),
region_1 = col_character(),
region_2 = col_character(),
taster_name = col_character(),
taster_twitter_handle = col_character(),
title = col_character(),
variety = col_character(),
winery = col_character()
)
|== | 3% 1 MB
|=== | 3% 1 MB
|=== | 4% 2 MB
|==== | 4% 2 MB
|==== | 5% 2 MB
|===== | 5% 3 MB
|===== | 6% 3 MB
|====== | 7% 3 MB
|====== | 7% 3 MB
|======= | 8% 4 MB
|======= | 8% 4 MB
|======== | 9% 4 MB
|======== | 9% 4 MB
|========= | 10% 5 MB
|========= | 10% 5 MB
|========== | 11% 5 MB
|========== | 12% 6 MB
|=========== | 12% 6 MB
|=========== | 13% 6 MB
|============ | 13% 6 MB
|============ | 14% 7 MB
|============= | 14% 7 MB
|============= | 15% 7 MB
|============== | 15% 8 MB
|============== | 16% 8 MB
|=============== | 16% 8 MB
|=============== | 17% 8 MB
|================ | 18% 9 MB
|================ | 18% 9 MB
|================= | 19% 9 MB
|================= | 19% 9 MB
|================== | 20% 10 MB
|================== | 20% 10 MB
|=================== | 21% 10 MB
|=================== | 21% 11 MB
|==================== | 22% 11 MB
|==================== | 23% 11 MB
|===================== | 23% 11 MB
|===================== | 24% 12 MB
|====================== | 24% 12 MB
|====================== | 25% 12 MB
|======================= | 25% 12 MB
|======================= | 26% 13 MB
|======================== | 26% 13 MB
|======================== | 27% 13 MB
|========================= | 27% 14 MB
|========================= | 28% 14 MB
|========================== | 29% 14 MB
|========================== | 29% 14 MB
|=========================== | 30% 15 MB
|=========================== | 30% 15 MB
|============================ | 31% 15 MB
|============================ | 31% 16 MB
|============================= | 32% 16 MB
|============================= | 32% 16 MB
|============================== | 33% 16 MB
|============================== | 33% 17 MB
|=============================== | 34% 17 MB
|=============================== | 35% 17 MB
|================================ | 35% 18 MB
|================================ | 36% 18 MB
|================================= | 36% 18 MB
|================================= | 37% 18 MB
|================================== | 37% 19 MB
|================================== | 38% 19 MB
|=================================== | 38% 19 MB
|=================================== | 39% 19 MB
|==================================== | 40% 20 MB
|==================================== | 40% 20 MB
|===================================== | 41% 20 MB
|===================================== | 41% 21 MB
|====================================== | 42% 21 MB
|====================================== | 42% 21 MB
|======================================= | 43% 21 MB
|======================================= | 43% 22 MB
|======================================== | 44% 22 MB
|======================================== | 45% 22 MB
|========================================= | 45% 23 MB
|========================================== | 46% 23 MB
|========================================== | 46% 23 MB
|========================================== | 47% 23 MB
|=========================================== | 47% 24 MB
|=========================================== | 48% 24 MB
|============================================ | 48% 24 MB
|============================================ | 49% 24 MB
|============================================= | 49% 25 MB
|============================================= | 50% 25 MB
|============================================== | 51% 25 MB
|============================================== | 51% 26 MB
|=============================================== | 52% 26 MB
|=============================================== | 52% 26 MB
|================================================ | 53% 26 MB
|================================================ | 53% 27 MB
|================================================= | 54% 27 MB
|================================================= | 54% 27 MB
|================================================== | 55% 27 MB
|================================================== | 55% 28 MB
|=================================================== | 56% 28 MB
|=================================================== | 57% 28 MB
|==================================================== | 57% 29 MB
|==================================================== | 58% 29 MB
|===================================================== | 58% 29 MB
|===================================================== | 59% 29 MB
|====================================================== | 59% 30 MB
|====================================================== | 60% 30 MB
|======================================================= | 60% 30 MB
|======================================================= | 61% 31 MB
|======================================================== | 62% 31 MB
|======================================================== | 62% 31 MB
|========================================================= | 63% 31 MB
|========================================================= | 63% 32 MB
|========================================================== | 64% 32 MB
|========================================================== | 64% 32 MB
|=========================================================== | 65% 32 MB
|============================================================ | 65% 33 MB
|============================================================ | 66% 33 MB
|============================================================= | 67% 33 MB
|============================================================= | 67% 34 MB
|============================================================== | 68% 34 MB
|============================================================== | 68% 34 MB
|=============================================================== | 69% 34 MB
|=============================================================== | 69% 35 MB
|================================================================ | 70% 35 MB
|================================================================ | 70% 35 MB
|================================================================= | 71% 36 MB
|================================================================= | 72% 36 MB
|================================================================== | 72% 36 MB
|================================================================== | 73% 36 MB
|=================================================================== | 73% 37 MB
|=================================================================== | 74% 37 MB
|=================================================================== | 74% 37 MB
|==================================================================== | 75% 37 MB
|==================================================================== | 75% 38 MB
|===================================================================== | 76% 38 MB
|===================================================================== | 76% 38 MB
|====================================================================== | 77% 39 MB
|====================================================================== | 78% 39 MB
|======================================================================= | 78% 39 MB
|======================================================================= | 79% 39 MB
|======================================================================== | 79% 40 MB
|======================================================================== | 80% 40 MB
|========================================================================= | 80% 40 MB
|========================================================================== | 81% 41 MB
|========================================================================== | 81% 41 MB
|========================================================================== | 82% 41 MB
|=========================================================================== | 82% 41 MB
|=========================================================================== | 83% 42 MB
|============================================================================ | 84% 42 MB
|============================================================================ | 84% 42 MB
|============================================================================= | 85% 42 MB
|============================================================================= | 85% 43 MB
|============================================================================== | 86% 43 MB
|============================================================================== | 86% 43 MB
|=============================================================================== | 87% 44 MB
|=============================================================================== | 87% 44 MB
|================================================================================ | 88% 44 MB
|================================================================================ | 88% 44 MB
|================================================================================= | 89% 45 MB
|================================================================================= | 90% 45 MB
|================================================================================== | 90% 45 MB
|================================================================================== | 91% 46 MB
|=================================================================================== | 91% 46 MB
|=================================================================================== | 92% 46 MB
|==================================================================================== | 92% 46 MB
|==================================================================================== | 93% 47 MB
|===================================================================================== | 93% 47 MB
|===================================================================================== | 94% 47 MB
|====================================================================================== | 95% 47 MB
|====================================================================================== | 95% 48 MB
|======================================================================================= | 96% 48 MB
|======================================================================================= | 96% 48 MB
|======================================================================================== | 97% 49 MB
|======================================================================================== | 97% 49 MB
|========================================================================================= | 98% 49 MB
|==========================================================================================| 98% 49 MB
|==========================================================================================| 99% 50 MB
|===========================================================================================| 100% 50 MB
# preprocessing
scale_taster <- function(points){
# takes a vector of numbers, subtracts every element by the mean of the vector, and then
# divides every element by the standard deviation of the vector
return((points - mean(points, na.rm = TRUE)) / sd(points, na.rm = TRUE))
}
percentile_taster <- function(x){
# takes a vector of numbers, ranks every element and divides by n, giving the percentile of each element
trunc(rank(x))/length(x) * 100
}
data <- data %>% group_by(taster_name) %>% mutate("Scaled_Points" = scale_taster(points))
data <- data %>% group_by(taster_name) %>% mutate("Percentile_Points" = percentile_taster(points))
tab <- data %>% group_by(province) %>% summarize("Proportion" = n()/nrow(data))
tab <- tab[tab$Proportion > 0.01, ]
tabcountry <- data %>% group_by(country) %>% summarize("Proportion" = n()/nrow(data))
tabcountry <- tabcountry[tabcountry$Proportion > 0.01, ]
data$country_other <- ifelse(data$country %in% tabcountry$country,
paste0(data$country, "_other"), data$country)
data$location <- ifelse(data$province %in% tab$province, data$province,
data$country_other)
year <- str_extract_all(data$title, "[1-2][09][0-9]{2}")
data$year <- lapply(year, function(x){
x = x %>% as.numeric
if(!all(is.na(x))){
newx <- x[(x > 1900) & (x < 2018)]
if (!all(is.na(newx))) {
newx <- max(newx)
return(newx)
} else {
return(NA)
}
} else {
return(NA)
}}) %>% unlist
data$location <- factor(data$location)
data$taster_name <- factor(data$taster_name)
data$taster_name <- addNA(data$taster_name)
data$title <- factor(data$title)
data$variety <- factor(data$variety)
data$region_1 <- factor(data$region_1)
data$region_2 <- factor(data$region_2)
data$country <- factor(data$country)
data$province <- factor(data$province)
data$winery <- factor(data$winery)
data$taster_twitter_handle <- factor(data$taster_twitter_handle)
data$designation <- factor(data$designation)
# helper function
impute_mean <- function(x) replace(x, is.na(x), mean(x, na.rm = TRUE))
# impute_mean replaces missing values with the average value of a group
clean <- function(df){
# clean removes the varieties that only have missing prices, and are thus unimputable by our rule,
# and then it imputes the remaining missing prices using the average price of that wine's variety
df %>% group_by(variety) %>% summarize("Average_Price" = mean(price, na.rm = T),
"Count" = n()) %>%
filter(is.na(Average_Price)) %>% select(variety) %>% unlist() -> drop_variety
df %>% filter(!(variety %in% drop_variety)) -> sample2
sample2 %>% group_by(variety) %>% mutate(price = impute_mean(price)) -> sample2
sample2 <- sample2[complete.cases(sample2),]
return(sample2)
}
# check data
dim(data)
[1] 119988 18
summary(data)
country description designation points price
US :50457 Length:119988 Reserve : 1871 Min. : 80.00 Min. : 4.00
France :20353 Class :character Estate : 1223 1st Qu.: 86.00 1st Qu.: 17.00
Italy :17940 Mode :character Reserva : 1176 Median : 88.00 Median : 25.00
Spain : 6116 Riserva : 647 Mean : 88.44 Mean : 35.62
Portugal: 5256 Estate Grown: 567 3rd Qu.: 91.00 3rd Qu.: 42.00
(Other) :19807 (Other) :79959 Max. :100.00 Max. :3300.00
NA's : 59 NA's :34545 NA's :8395
province region_1 region_2 taster_name
California:33656 Napa Valley : 4174 Central Coast :10233 NA :24917
Washington: 7965 Columbia Valley (WA): 3795 Sonoma : 8390 Roger Voss :23560
Bordeaux : 5556 Russian River Valley: 2862 Columbia Valley : 7466 Michael Schachner:14046
Tuscany : 5391 California : 2468 Napa : 6369 Kerin O’Keefe : 9697
Oregon : 4929 Paso Robles : 2155 Willamette Valley: 3142 Paul Gregutt : 8868
(Other) :62432 (Other) :84974 (Other) :11169 Virginie Boone : 8708
NA's : 59 NA's :19560 NA's :73219 (Other) :30192
taster_twitter_handle title
@vossroger :23560 Gloria Ferrer NV Sonoma Brut Sparkling (Sonoma County) : 9
@wineschach :14046 Segura Viudas NV Aria Estate Extra Dry Sparkling (Cava) : 7
@kerinokeefe: 9697 Segura Viudas NV Extra Dry Sparkling (Cava) : 7
@paulgwine : 8868 Bailly-Lapierre NV Brut (Crémant de Bourgogne) : 6
@vboone : 8708 Gloria Ferrer NV Blanc de Noirs Sparkling (Carneros) : 6
(Other) :25663 J Vineyards & Winery NV Brut Rosé Sparkling (Russian River Valley): 6
NA's :29446 (Other) :119947
variety winery Scaled_Points Percentile_Points
Pinot Noir :12278 Wines & Winemakers: 211 Min. :-4.3836 Min. : 0.01031
Chardonnay :10868 Williams Selyem : 204 1st Qu.:-0.7342 1st Qu.: 23.96647
Cabernet Sauvignon : 8840 Testarossa : 201 Median :-0.0236 Median : 48.31980
Red Blend : 8243 DFJ Vinhos : 200 Mean : 0.0000 Mean : 50.00378
Bordeaux-style Red Blend: 6471 Louis Latour : 192 3rd Qu.: 0.6985 3rd Qu.: 73.33108
(Other) :73287 Georges Duboeuf : 186 Max. : 4.3888 Max. :100.00000
NA's : 1 (Other) :118794
country_other location year
Length:119988 California :33656 Min. :1904
Class :character Washington : 7965 1st Qu.:2009
Mode :character Bordeaux : 5556 Median :2011
Tuscany : 5391 Mean :2011
Portugal_other: 5256 3rd Qu.:2013
(Other) :62105 Max. :2017
NA's : 59 NA's :4285
str(data)
Classes ‘grouped_df’, ‘tbl_df’, ‘tbl’ and 'data.frame': 119988 obs. of 18 variables:
$ country : Factor w/ 43 levels "Argentina","Armenia",..: 23 32 43 43 43 38 23 16 18 16 ...
$ description : chr "Aromas include tropical fruit, broom, brimstone and dried herb. The palate isn't overly expressive, offering un"| __truncated__ "This is ripe and fruity, a wine that is smooth while still structured. Firm tannins are filled out with juicy r"| __truncated__ "Tart and snappy, the flavors of lime flesh and rind dominate. Some green pineapple pokes through, with crisp ac"| __truncated__ "Pineapple rind, lemon pith and orange blossom start off the aromas. The palate is a bit more opulent, with note"| __truncated__ ...
$ designation : Factor w/ 37979 levels "??? Vineyard",..: 36976 2352 NA 28123 36715 1996 3051 NA 30971 20046 ...
$ points : int 87 87 87 87 87 87 87 87 87 87 ...
$ price : num NA 15 14 13 65 15 16 24 12 27 ...
$ province : Factor w/ 425 levels "Achaia","Aconcagua Costa",..: 334 110 269 220 269 263 334 12 310 12 ...
$ region_1 : Factor w/ 1229 levels "Abruzzo","Adelaida District",..: 425 NA 1218 550 1218 758 1205 22 NA 22 ...
$ region_2 : Factor w/ 17 levels "California Other",..: NA NA 17 NA 17 NA NA NA NA NA ...
$ taster_name : Factor w/ 20 levels "Alexander Peartree",..: 10 16 15 1 15 13 10 16 2 16 ...
$ taster_twitter_handle: Factor w/ 15 levels "@AnneInVino",..: 5 11 8 NA 8 13 5 11 NA 11 ...
$ title : Factor w/ 118840 levels ":Nota Bene 2005 Una Notte Red (Washington)",..: 79669 89457 89940 101059 102995 103740 105794 108715 54638 59312 ...
$ variety : Factor w/ 707 levels "Abouriou","Agiorgitiko",..: 692 452 438 481 442 593 188 211 211 438 ...
$ winery : Factor w/ 16757 levels ":Nota Bene","1+1=3",..: 11641 12988 13054 14432 14665 14740 15046 15435 8433 9014 ...
$ Scaled_Points : num -0.75 -0.562 -0.734 0.628 -0.734 ...
$ Percentile_Points : num 23.7 31.9 23.8 73.9 23.8 ...
$ country_other : chr "Italy_other" "Portugal_other" "US_other" "US_other" ...
$ location : Factor w/ 62 levels "Alsace","Argentina_other",..: 47 44 41 60 41 40 47 1 23 1 ...
$ year : num 2013 2011 2013 2013 2012 ...
- attr(*, "vars")= chr "taster_name"
- attr(*, "labels")='data.frame': 20 obs. of 1 variable:
..$ taster_name: chr "Alexander Peartree" "Anna Lee C. Iijima" "Anne Krebiehl MW" "Carrie Dykes" ...
..- attr(*, "vars")= chr "taster_name"
..- attr(*, "labels")='data.frame': 20 obs. of 1 variable:
.. ..$ taster_name: chr "Alexander Peartree" "Anna Lee C. Iijima" "Anne Krebiehl MW" "Carrie Dykes" ...
.. ..- attr(*, "vars")= chr "taster_name"
.. ..- attr(*, "drop")= logi TRUE
..- attr(*, "indices")=List of 20
.. ..$ : int 3 19 20 724 740 902 905 906 1202 1396 ...
.. ..$ : int 8 15 76 85 97 100 101 102 148 156 ...
.. ..$ : int 93 340 423 428 435 441 444 584 586 587 ...
.. ..$ : int 1625 1631 2379 3574 6939 8574 10533 10541 15859 15873 ...
.. ..$ : int 19635 40490 42883 57207 66923 92873
.. ..$ : int 1627 2380 2406 19477 19483 20404 29095 30291 30298 31221 ...
.. ..$ : int 219 351 873 1543 2480 2495 2740 2741 2742 3550 ...
.. ..$ : int 68 199 424 433 519 558 561 571 579 588 ...
.. ..$ : int 77 83 123 136 174 191 209 211 238 293 ...
.. ..$ : int 0 6 13 22 24 26 27 28 61 72 ...
.. ..$ : int 197 208 210 225 226 256 257 258 264 265 ...
.. ..$ : int 14 23 64 108 114 115 116 117 145 146 ...
.. ..$ : int 5 16 17 18 36 44 51 58 80 81 ...
.. ..$ : int 202 215 913 1023 1187 3336 3578 4038 4229 4709 ...
.. ..$ : int 2 4 21 35 41 78 173 233 248 251 ...
.. ..$ : int 1 7 9 11 30 42 49 53 63 65 ...
.. ..$ : int 59 62 67 70 86 94 421 422 429 432 ...
.. ..$ : int 230 271 308 325 409 410 415 596 984 1063 ...
.. ..$ : int 10 12 25 29 56 60 71 73 74 75 ...
.. ..$ : int 31 32 33 34 37 38 39 40 43 45 ...
..- attr(*, "drop")= logi TRUE
..- attr(*, "group_sizes")= int 383 4017 3290 129 6 24 436 3766 4766 9697 ...
..- attr(*, "biggest_group_size")= int 24917
- attr(*, "indices")=List of 20
..$ : int 3 19 20 724 740 902 905 906 1202 1396 ...
..$ : int 8 15 76 85 97 100 101 102 148 156 ...
..$ : int 93 340 423 428 435 441 444 584 586 587 ...
..$ : int 1625 1631 2379 3574 6939 8574 10533 10541 15859 15873 ...
..$ : int 19635 40490 42883 57207 66923 92873
..$ : int 1627 2380 2406 19477 19483 20404 29095 30291 30298 31221 ...
..$ : int 219 351 873 1543 2480 2495 2740 2741 2742 3550 ...
..$ : int 68 199 424 433 519 558 561 571 579 588 ...
..$ : int 77 83 123 136 174 191 209 211 238 293 ...
..$ : int 0 6 13 22 24 26 27 28 61 72 ...
..$ : int 197 208 210 225 226 256 257 258 264 265 ...
..$ : int 14 23 64 108 114 115 116 117 145 146 ...
..$ : int 5 16 17 18 36 44 51 58 80 81 ...
..$ : int 202 215 913 1023 1187 3336 3578 4038 4229 4709 ...
..$ : int 2 4 21 35 41 78 173 233 248 251 ...
..$ : int 1 7 9 11 30 42 49 53 63 65 ...
..$ : int 59 62 67 70 86 94 421 422 429 432 ...
..$ : int 230 271 308 325 409 410 415 596 984 1063 ...
..$ : int 10 12 25 29 56 60 71 73 74 75 ...
..$ : int 31 32 33 34 37 38 39 40 43 45 ...
- attr(*, "drop")= logi TRUE
- attr(*, "group_sizes")= int 383 4017 3290 129 6 24 436 3766 4766 9697 ...
- attr(*, "biggest_group_size")= int 24917
data <- data %>% select(-country_other, -taster_twitter_handle, -description, -winery, -designation)
# split train test
set.seed(2018)
train.index <- sample(2/3 * nrow(data))
train <- data[train.index,]
test <- data[-train.index,]
dim(train)
[1] 79992 13
summary(train)
country points price province region_1
US :33364 Min. : 80.00 Min. : 4.0 California:22172 Napa Valley : 2733
France :13682 1st Qu.: 86.00 1st Qu.: 17.0 Washington: 5244 Columbia Valley (WA): 2495
Italy :12151 Median : 88.00 Median : 25.0 Bordeaux : 3777 Russian River Valley: 1881
Spain : 4121 Mean : 88.44 Mean : 35.4 Tuscany : 3644 California : 1614
Portugal: 3597 3rd Qu.: 91.00 3rd Qu.: 42.0 Oregon : 3347 Willamette Valley : 1459
(Other) :13039 Max. :100.00 Max. :3300.0 (Other) :41770 (Other) :56775
NA's : 38 NA's :5641 NA's : 38 NA's :13035
region_2 taster_name
Central Coast : 6734 NA :16586
Sonoma : 5532 Roger Voss :15883
Columbia Valley : 4904 Michael Schachner: 9377
Napa : 4163 Kerin O’Keefe : 6527
Willamette Valley: 2137 Paul Gregutt : 5984
(Other) : 7403 Virginie Boone : 5717
NA's :49119 (Other) :19918
title variety
Korbel NV Brut Sparkling (California) : 6 Pinot Noir : 8080
Segura Viudas NV Aria Estate Extra Dry Sparkling (Cava): 6 Chardonnay : 7190
Segura Viudas NV Extra Dry Sparkling (Cava) : 6 Cabernet Sauvignon : 5848
Bailly-Lapierre NV Brut (Crémant de Bourgogne) : 5 Red Blend : 5577
Gloria Ferrer NV Sonoma Brut Sparkling (Sonoma County) : 5 Bordeaux-style Red Blend: 4276
Jacquart NV Brut Mosaïque (Champagne) : 5 Riesling : 3158
(Other) :79959 (Other) :45863
Scaled_Points Percentile_Points location year
Min. :-4.383631 Min. : 0.01031 California :22172 Min. :1904
1st Qu.:-0.726995 1st Qu.: 23.96647 Washington : 5244 1st Qu.:2008
Median :-0.023602 Median : 48.31980 Bordeaux : 3777 Median :2011
Mean :-0.001477 Mean : 49.93261 Tuscany : 3644 Mean :2011
3rd Qu.: 0.698512 3rd Qu.: 73.33108 Portugal_other: 3597 3rd Qu.:2013
Max. : 4.388765 Max. :100.00000 (Other) :41520 Max. :2017
NA's : 38 NA's :2909
dim(test)
[1] 39996 13
summary(test)
country points price province region_1
US :17093 Min. : 80.00 Min. : 4.00 California:11484 Napa Valley : 1441
France : 6671 1st Qu.: 86.00 1st Qu.: 17.00 Washington: 2721 Columbia Valley (WA): 1300
Italy : 5789 Median : 88.00 Median : 25.00 Bordeaux : 1779 Russian River Valley: 981
Spain : 1995 Mean : 88.44 Mean : 36.07 Tuscany : 1747 California : 854
Portugal: 1659 3rd Qu.: 91.00 3rd Qu.: 44.00 Oregon : 1582 Mendoza : 749
(Other) : 6768 Max. :100.00 Max. :2500.00 (Other) :20662 (Other) :28146
NA's : 21 NA's :2754 NA's : 21 NA's : 6525
region_2 taster_name
Central Coast : 3499 NA : 8331
Sonoma : 2858 Roger Voss : 7677
Columbia Valley : 2562 Michael Schachner: 4669
Napa : 2206 Kerin O’Keefe : 3170
Willamette Valley: 1005 Virginie Boone : 2991
(Other) : 3766 Paul Gregutt : 2884
NA's :24100 (Other) :10274
title variety
Gloria Ferrer NV Blanc de Noirs Sparkling (Carneros) : 4 Pinot Noir : 4198
Gloria Ferrer NV Sonoma Brut Sparkling (Sonoma County): 4 Chardonnay : 3678
Ruinart NV Blanc de Blancs Brut Chardonnay (Champagne): 4 Cabernet Sauvignon : 2992
A.R. Lenoble NV Terroirs Brut Rosé (Champagne) : 3 Red Blend : 2666
Canard-Duchêne NV Cuvée Léonie Brut (Champagne) : 3 Bordeaux-style Red Blend: 2195
Freixenet NV Carta Nevada Brut Sparkling (Cava) : 3 (Other) :24266
(Other) :39975 NA's : 1
Scaled_Points Percentile_Points location year
Min. :-4.383631 Min. : 0.01745 California :11484 Min. :1927
1st Qu.:-0.734211 1st Qu.: 23.96647 Washington : 2721 1st Qu.:2009
Median : 0.040399 Median : 53.38887 Bordeaux : 1779 Median :2011
Mean : 0.002954 Mean : 50.14612 Tuscany : 1747 Mean :2011
3rd Qu.: 0.735078 3rd Qu.: 74.94907 Portugal_other: 1659 3rd Qu.:2013
Max. : 3.993459 Max. :100.00000 (Other) :20585 Max. :2017
NA's : 21 NA's :1376
# clean train
clean_train <- train[lapply(train, function(x) sum(is.na(x)) / length(x)) < 0.1]
clean_train <- clean(clean_train)
clean_train$`US_vs_non-US` <- factor(ifelse(clean_train$country == 'US', 'US', 'non-US'))
clean_train[is.na(clean_train$country), 'US_vs_non-US'] <- NA
clean_train$`US_vs_non-US` <- addNA(clean_train$`US_vs_non-US`)
#clean_train <- clean_train %>% select(-title, -country)
clean_train <- clean_train %>% select(-Scaled_Points, -Percentile_Points)
dim(clean_train)
[1] 77038 10
ggplot(aes(x=points, y=price, col = taster_name), data = clean_train) + geom_jitter()

ggplot(aes(x=points, y=price, col = `US_vs_non-US`), data = clean_train) + geom_jitter() + theme(legend.position = "right")

ggplot(aes(x=points, y=price, col = variety), data = clean_train) + geom_jitter() + theme(legend.position = 'none')

ggplot(aes(x=year, y=price, col=`US_vs_non-US`), data = clean_train) + geom_jitter()

ggplot(aes(x=year, y=points, col=`US_vs_non-US`), data = clean_train) + geom_jitter()

# gam only: points and price
set.seed(2018)
k <- 10
sp <- split(c(1:nrow(train)), c(1:k))
data length is not a multiple of split variable
price_pt_gam_error <- matrix(NA, nrow=k, ncol=2)
for(i in 1:k){
cleanwine_train <- train[-sp[[i]], ]
cleanwine_test <- train[sp[[i]], ]
# data cleaning
cleanwine_train <- cleanwine_train[lapply(cleanwine_train, function(x) sum(is.na(x)) / length(x)) < 0.1]
cleanwine_train <- clean(cleanwine_train)
cleanwine_train <- cleanwine_train %>% select(-title, -country)
#print(colnames(cleanwine_train))
#print(head(cleanwine_train))
cleanwine_test <- cleanwine_test[lapply(cleanwine_test, function(x) sum(is.na(x)) / length(x)) < 0.1]
cleanwine_test <- clean(cleanwine_test)
cleanwine_test <- cleanwine_test %>% select(-title, -country)
#print(colnames(cleanwine_test))
#print(head(cleanwine_test))
# select only Percentile_Points and Price for gam
#cleanwine_train_gam <- cleanwine_train %>% select(Percentile_Points, price)
#cleanwine_test_gam <- cleanwine_test %>% select(Percentile_Points, price)
# gam
price_gam <- gam(price ~ s(Percentile_Points), data = cleanwine_train)
pt_gam <- gam(Percentile_Points ~ s(price), data = cleanwine_train)
price_pred <- predict(price_gam, cleanwine_test)
pt_pred <- predict(pt_gam, cleanwine_test)
price_pt_gam_error[i,1] <- mean(abs(price_pred- cleanwine_test$price))
price_pt_gam_error[i,2] <- mean((pt_pred - cleanwine_test$Percentile_Points)^2)
}
price_pt_gam_error
[,1] [,2]
[1,] 16.44460 589.3486
[2,] 17.22497 588.3934
[3,] 16.22723 593.4306
[4,] 15.96160 589.3421
[5,] 15.91975 601.6767
[6,] 16.13594 575.2521
[7,] 16.11612 586.4804
[8,] 16.03400 592.9562
[9,] 15.83370 580.9728
[10,] 16.39469 575.7507
# CV on train
set.seed(2018)
k <- 10
sp <- createFolds(train$variety, k)
price_fold_error <- matrix(NA, nrow=k, ncol=5)
pt_fold_error <- matrix(NA, nrow=k, ncol=5)
for(i in 1:k){
cleanwine_train <- train[-sp[[k]], ]
cleanwine_test <- train[sp[[k]], ]
# data cleaning
cleanwine_train <- cleanwine_train[lapply(cleanwine_train, function(x) sum(is.na(x)) / length(x)) < 0.1]
cleanwine_train <- clean(cleanwine_train)
cleanwine_train <- cleanwine_train %>% select(-title, -country)
#print(colnames(cleanwine_train))
cleanwine_test <- cleanwine_test[lapply(cleanwine_test, function(x) sum(is.na(x)) / length(x)) < 0.1]
cleanwine_test <- clean(cleanwine_test)
cleanwine_test <- cleanwine_test %>% select(-title, -country)
#print(colnames(cleanwine_test))
cleanwine_train <- cleanwine_train %>% select(-Scaled_Points, -Percentile_Points)
train.data.price <- cleanwine_train %>% select(-price)
#print(colnames(train.data.price))
train.data.price <- sparse.model.matrix(~., train.data.price)[,-1]
train.data.pt <- cleanwine_train %>% select(-points)
train.data.pt <- sparse.model.matrix(~., train.data.pt)[,-1]
cleanwine_test <- cleanwine_test %>% select(-Scaled_Points, -Percentile_Points)
test.data.price <- cleanwine_test %>% select(-price)
#print(colnames(test.data.price))
test.data.price <- sparse.model.matrix(~., test.data.price)[,-1]
#print(colnames(test.data.price))
test.data.pt <- cleanwine_test %>% select(-points)
test.data.pt <- sparse.model.matrix(~., test.data.pt)[,-1]
# random forest
subsamps <- seq(0.1, 1, 0.1)
train_error_price <- vector("numeric", length(seq(0.1, 1, 0.1)))
train_error_pt <- vector("numeric", length(seq(0.1, 1, 0.1)))
for (j in 1:length(seq(0.1, 1, 0.1))) {
rf_train_price <- xgboost(data = train.data.price, label=cleanwine_train$price, max_depth = 0, verbose = 0, num_parallel_tree = 1000, subsample = subsamps[i], nrounds = 1, colsample_bylevel=0.6, objective = "reg:linear", eval_metric = 'mae')
rf_train_pt <- xgboost(data = train.data.pt, label=cleanwine_train$points, max_depth = 0, verbose = 0, num_parallel_tree = 1000, subsample = subsamps[i], nrounds = 1, colsample_bylevel=0.6, objective = "reg:linear", eval_metric = 'rmse')
train_error_price[j] <- as.numeric(rf_train_price$evaluation_log[,2])
train_error_pt[j] <- as.numeric(rf_train_pt$evaluation_log[,2])
}
index.min.price <- which.min(train_error_price)
index.min.pt <- which.min(train_error_pt)
rf_cleanwine_price <- xgboost(data = train.data.price, label=cleanwine_train$price, max_depth = 0, verbose = 0, num_parallel_tree = 1000, subsample = subsamps[index.min.price], nrounds = 1, colsample_bylevel=0.6, objective = "reg:linear", eval_metric = 'mae')
#print(rf_cleanwine_price$feature_names)
rf_cleanwine_pt <- xgboost(data = train.data.pt, label=cleanwine_train$points, max_depth = 0, verbose = 0, num_parallel_tree = 1000, subsample = subsamps[index.min.pt], nrounds = 1, colsample_bylevel=0.6, objective = "reg:linear", eval_metric = 'rmse')
print(length(rf_cleanwine_pt$feature_names) == length(colnames(test.data.price)))
test.pred.price <- predict(rf_cleanwine_price, test.data.price)
price_fold_error[i, 1] <- mean(abs(cleanwine_test$price - test.pred.price))
test.pred.pt <- predict(rf_cleanwine_pt, test.data.pt)
pt_fold_error[i,1] <- mean((cleanwine_test$price - test.pred.pt)^2)
# svm
svm_cleanwine_price <- svm(price~., cleanwine_train)
svm_cleanwine_pt <- svm(points~., cleanwine_train)
result_price <- test(svm_cleanwine_price, cleanwine_test)
price_fold_error[i,2] <- mean(abs(cleanwine_test$price - result_price))
result_pt <- test(svm_cleanwine_pt, cleanwine_test)
pt_fold_error[i, 2] <- mean((cleanwine_test$points - result_pt)^2)
# gbm
gbmFit_price <- gbm(formula = price ~ ., data = cleanwine_train,
n.trees = 1000, shrinkage = 0.05, interaction.depth = 2, cv.folds = 10,
distribution = "laplace", verbose = FALSE)
best_iter_price <- gbm.perf(gbmFit_price, method = "cv", plot.it = F)
gbm_price_pred <- predict(gbmFit_price, newdata = cleanwine_test, n.trees = best_iter_price)
price_fold_error[i,3] <- mean(abs(cleanwine_test$price - gbm_price_pred))
gbmFit_pt <- gbm(formula = points ~ ., data = cleanwine_train,
n.trees = 1000, shrinkage = 0.05, interaction.depth = 2, cv.folds = 10,
distribution = "laplace", verbose = FALSE)
best_iter_pt <- gbm.perf(gbmFit_pt, method = "cv", plot.it = F)
gbm_pt_pred <- predict(gbmFit_pt, newdata = cleanwine_test, n.trees = best_iter_pt)
pt_fold_error[i,3] <- mean((cleanwine_test$points - gbm_pt_pred)^2)
# glmnet
# trn.mtx <- model.matrix(~.,cleanwine_train)
# trn.smtx <- Matrix(trn.mtx,sparse=T)[,-1]
#
# tst.mtx <- model.matrix(~.,cleanwine_test)
# tst.smtx <- Matrix(tst.mtx,sparse=T)[,-1]
fit.lasso.price <- cv.glmnet(x=train.data.price,cleanwine_train$price,alpha = 1,type.measure = "mae")
l.err.price <- predict(fit.lasso.price,newx = test.data.price,type = 'response')
price_fold_error[i,4] <- mean(abs(cleanwine_test$price - l.err.price))
fit.ridge.price <- cv.glmnet(x=train.data.price,cleanwine_train$price,alpha=0,type.measure = "mae")
r.err.price <- predict(fit.ridge.price,newx = test.data.price, type = 'response')
price_fold_error[i,5] <- mean(abs(cleanwine_test$price - r.err.price))
fit.lasso.pt <- cv.glmnet(x=train.data.pt,cleanwine_train$points,alpha = 1,type.measure = "mse")
l.err.pt <- predict(fit.lasso.pt,newx = test.data.pt,type = 'response')
pt_fold_error[i,4] <- mean((cleanwine_test$points - l.err.pt)^2)
fit.ridge.pt <- cv.glmnet(x=train.data.pt,cleanwine_train$points,alpha=0,type.measure = "mse")
r.err.pt <- predict(fit.ridge.pt,newx = test.data.pt, type = 'response')
pt_fold_error[i,5] <- mean((cleanwine_test$points - r.err.pt)^2)
}
[1] TRUE

[1] TRUE


[1] TRUE


[1] TRUE


[1] TRUE


[1] TRUE


[1] TRUE


[1] TRUE


[1] TRUE


[1] TRUE



colnames(price_fold_error) <- c("random_forest", "svm", "gbm", "lasso", "ridge")
colnames(pt_fold_error) <- c("random_forest", "svm", "gbm", "lasso", "ridge")
price_fold_error
random_forest svm gbm lasso ridge
[1,] 24.07756 12.65667 15.63732 14.77860 14.00466
[2,] 24.08077 12.71607 15.56506 14.78930 14.00466
[3,] 24.07985 12.62514 15.73344 14.80146 14.00466
[4,] 24.07889 12.87436 15.70824 14.77860 14.04065
[5,] 24.08126 12.53914 15.60783 14.77860 14.00466
[6,] 24.07933 12.70176 15.79083 14.77860 14.00466
[7,] 24.07805 12.80975 15.65192 14.76829 14.04065
[8,] 24.07805 12.77349 15.67500 14.81334 13.97669
[9,] 24.08122 12.82400 15.71277 14.78930 14.00466
[10,] 24.07889 12.66297 15.64105 14.78930 14.00466
pt_fold_error
random_forest svm gbm lasso ridge
[1,] 1254.200 5.070732 7.089304 6.659461 6.693300
[2,] 1254.195 5.056448 7.097442 6.610804 6.693300
[3,] 1254.215 5.036886 7.176921 6.582427 6.746977
[4,] 1254.203 5.054266 7.278442 6.690893 6.719021
[5,] 1254.229 5.033564 7.269607 6.643468 6.693300
[6,] 1254.256 5.049380 7.404647 6.596118 6.669693
[7,] 1254.204 5.045616 7.227224 6.596118 6.693300
[8,] 1254.198 5.058707 7.174839 6.627099 6.719021
[9,] 1254.213 5.045101 7.244036 6.582427 6.693300
[10,] 1254.260 5.042953 7.129418 6.582427 6.693300
result_test_price <- predict(svm_test_price, clean_test)
mean(abs(clean_test$price - result_test_price))
[1] 13.78876
result_test_pt <- predict(svm_test_pt, clean_test)
mean((clean_test$points - result_test_pt)^2)
[1] 5.553673
clean_test$country <- clean_test_country
clean_test$province <- clean_test_province
clean_test$price_pred <- result_test_price
clean_test$point_pred <- result_test_pt
write_csv(clean_test, "clean_test.csv")
price_pt_df <- data.frame(cbind(clean_test$points, clean_test$price, result_test_price, result_test_pt))
colnames(price_pt_df) <- c("points", "price", "pred_price", "pred_point")
ggplot(aes(x=points, y=price), data = price_pt_df) + geom_jitter() + geom_point(aes(x=points, y=pred_price), col="red")

ggplot(aes(x=price, y=points), data = price_pt_df) + geom_jitter() + geom_point(aes(x=price, y=pred_point), col="red")

price_pt_pred <- data.frame(cbind(result_test_price, result_test_pt))
colnames(price_pt_pred) <- c("price_pred", "point_pred")
clean_test_pred <- merge(clean_test, price_pt_pred)
# test
cleanwine <- data
cleanwine$location <- factor(cleanwine$location)
cleanwine$taster_name <- factor(cleanwine$taster_name)
cleanwine$title <- factor(cleanwine$title)
cleanwine$variety <- factor(cleanwine$variety)
cleanwine$taster_name <- addNA(cleanwine$taster_name)
cleanwine$region_1 <- factor(cleanwine$region_1)
cleanwine$region_2 <- factor(cleanwine$region_2)
cleanwine <- cleanwine[lapply(cleanwine, function(x) sum(is.na(x)) / length(x) ) < 0.1]
cleanwine <- clean(cleanwine)
cleanwine <- cleanwine %>% select(-title, -country)
summary(cleanwine)
tmp = data #%>% select(-Scaled_Points, -Percentile_Points)
train.idx <- sample(nrow(tmp), 4/5 * nrow(tmp))
cleanwine_train <- tmp[train.idx, ]
cleanwine_test <- tmp[-train.idx, ]
cleanwine_train$location <- factor(cleanwine_train$location)
cleanwine_train$taster_name <- factor(cleanwine_train$taster_name)
cleanwine_train$taster_name <- addNA(cleanwine_train$taster_name)
cleanwine_train$title <- factor(cleanwine_train$title)
cleanwine_train$variety <- factor(cleanwine_train$variety)
cleanwine_train$region_1 <- factor(cleanwine_train$region_1)
cleanwine_train$region_2 <- factor(cleanwine_train$region_2)
cleanwine_test$location <- factor(cleanwine_test$location)
cleanwine_test$taster_name <- factor(cleanwine_test$taster_name)
cleanwine_test$taster_name <- addNA(cleanwine_test$taster_name)
cleanwine_test$title <- factor(cleanwine_test$title)
cleanwine_test$variety <- factor(cleanwine_test$variety)
cleanwine_test$region_1 <- factor(cleanwine_test$region_1)
cleanwine_test$region_2 <- factor(cleanwine_test$region_2)
cleanwine_train <- cleanwine_train[lapply(cleanwine_train, function(x) sum(is.na(x)) / length(x) ) < 0.1]
cleanwine_train <- clean(cleanwine_train)
cleanwine_train <- cleanwine_train %>% select(-title, -country)
print(colnames(cleanwine_train))
cleanwine_test <- cleanwine_test[lapply(cleanwine_test, function(x) sum(is.na(x)) / length(x) ) < 0.1]
cleanwine_test <- clean(cleanwine_test)
cleanwine_test <- cleanwine_test %>% select(-title, -country)
print(colnames(cleanwine_test))
cleanwine_train <- cleanwine_train %>% select(-Scaled_Points, -Percentile_Points)
train.data.price <- cleanwine_train %>% select(-price)
print(colnames(train.data.price))
train.data.price <- sparse.model.matrix(~., train.data.price)[,-1]
train.data.pt <- cleanwine_train %>% select(-points)
train.data.pt <- sparse.model.matrix(~., train.data.pt)[,-1]
cleanwine_test <- cleanwine_test %>% select(-Scaled_Points, -Percentile_Points)
test.data.price <- cleanwine_test %>% select(-price)
print(colnames(test.data.price))
test.data.price <- sparse.model.matrix(~., test.data.price)[,-1]
print(colnames(test.data.price))
test.data.pt <- cleanwine_test %>% select(-points)
test.data.pt <- sparse.model.matrix(~., test.data.pt)[,-1]
# tmp = cleanwine %>% select(-Scaled_Points, -Percentile_Points)
#
# train.idx <- sample(nrow(tmp), 4/5 * nrow(tmp))
# tmp.train <- tmp[train.idx, ]
# tmp.test <- tmp[-train.idx, ]
# train.data.price <- tmp.train %>% select(-price)
# train.data.price <- sparse.model.matrix(~., train.data.price)[,-1]
# print(colnames(train.data.price))
# train.data.pt <- tmp.train %>% select(-points)
# train.data.pt <- sparse.model.matrix(~., train.data.pt)[,-1]
# test.data.price <- tmp.test %>% select(-price)
# test.data.price <- sparse.model.matrix(~., test.data.price)[,-1]
# print(colnames(test.data.price))
# test.data.pt <- tmp.test %>% select(-points)
# test.data.pt <- sparse.model.matrix(~., test.data.pt)[,-1]
subsamps <- seq(0.1, 1, 0.1)
colsamps <- seq(0.1, 1, 0.1)
#train_error <- matrix(NA, nrow = length(seq(0.1, 1, 0.1)), ncol = length(seq(0.1, 1, 0.1)))
train_error_price <- vector("numeric", length(seq(0.1, 1, 0.1)))
train_error_pt <- vector("numeric", length(seq(0.1, 1, 0.1)))
for (i in 1:length(seq(0.1, 1, 0.1))) {
#for (j in 1:length(seq(0.1, 1, 0.1))) {
rf_cleanwine_price <- xgboost(data = train.data.price, label=cleanwine_train$price, verbose = 0, max_depth = 5, num_parallel_tree = 1000, subsample = subsamps[i], nrounds = 1, colsample_bylevel=0.6, objective = "reg:linear", eval_metric = 'mae')
rf_cleanwine_pt <- xgboost(data = train.data.pt, label=cleanwine_train$points, verbose = 0, max_depth = 5, num_parallel_tree = 1000, subsample = subsamps[i], nrounds = 1, colsample_bylevel=0.6, objective = "reg:linear", eval_metric = 'rmse')
#train_error[i,j] <- as.numeric(rf_cleanwine$evaluation_log[,2])
train_error_price[i] <- as.numeric(rf_cleanwine_price$evaluation_log[,2])
train_error_pt[i] <- as.numeric(rf_cleanwine_pt$evaluation_log[,2])
#}
}
min.index.price <- which(train_error_price == min(train_error_price), arr.ind = TRUE)
min.index.pt <- which(train_error_pt == min(train_error_pt), arr.ind = TRUE)
rf_cleanwine_price <- xgboost(data = train.data.price, label=cleanwine_train$price, max_depth = 0, verbose = 0, num_parallel_tree = 1000, subsample = subsamps[min.index.price], nrounds = 1, colsample_bylevel=0.6, objective = "reg:linear", eval_metric = 'mae')
print(rf_cleanwine_price$feature_names)
rf_cleanwine_pt <- xgboost(data = train.data.pt, label=cleanwine_train$points, max_depth = 0, verbose = 0, num_parallel_tree = 1000, subsample = subsamps[min.index.pt], nrounds = 1, colsample_bylevel=0.6, objective = "reg:linear", eval_metric = 'rmse')
test.pred.price <- predict(rf_cleanwine_price, test.data.price)
mean(abs(test.pred.price - tmp.test$price))
test.pred.pt <- predict(rf_cleanwine_pt, test.data.pt)
mean((test.pred.pt - tmp.test$points)^2)
which(train_error == min(train_error), arr.ind = TRUE)
#gammalist <- c(0.005,0.01,0.015,0.02,0.025,0.03,0.035,0.04,0.045,0.05)
#tune.out <- tune.svm(tmp.data, tmp$price,
#kernel='linear', cost=2^(-1:5), gamma = gammalist)
#summary(tune.out)
svm_cleanwine_price <- svm(price~., tmp.train)
svm_cleanwine_price
svm_cleanwine_pt <- svm(points~., tmp.train)
svm_cleanwine_pt
result_price <- test(svm_cleanwine_price, tmp.test)
mean(abs(tmp$price - result_price))
result_pt <- test(svm_cleanwine_pt, tmp.test)
mean((tmp$points - result_pt)^2)
## Split in training and test data
train.idx <- sample(nrow(iris), 2/3 * nrow(iris))
iris.train <- iris[train.idx, ]
iris.test <- iris[-train.idx, ]
## Run case-specific RF
csrf(Species ~ ., training_data = iris.train, test_data = iris.test,
params1 = list(num.trees = 50, mtry = 4),
params2 = list(num.trees = 5))
#idx = sample(1:900)
tmp_train = tmp[1:900, ]
tmp_test = tmp[901:1000, ]
rf_cleanwine <- csrf(sparse.model.matrix(tmp_train %>% select(-price)), tmp_train$price,
params1 = list(num.trees = 500, mtry = 4),
params2 = list(num.trees = 50, mtry = 4))
rf_cleanwine <- csrf(price ~ ., cleanwine %>% select(-Scaled_Points), params1 = list(num.trees = 50, mtry = 4))
# gbm
gbmFit <- gbm(formula = price ~ ., data = cleanwine_price_10perc %>% select(-title, -winery),
n.trees = 1000, shrinkage = 0.01, interaction.depth = 2, cv.folds = 10,
distribution = "gaussian")
best_iter <- gbm.perf(gbmFit, method = "cv")
# Performance on whole dataset
diff_squared <- (cleanwine_price_10perc$price -
predict(gbmFit, newdata = cleanwine_price_10perc, n.trees = best_iter))^2
mean(diff_squared, na.rm = TRUE)
# ranger
csrf_wine <- csrf(formula = price ~ ., training_data = cleanwine_price_train, test_data = cleanwine_price_test, params1 = list(num.trees = 50, mtry = 4), params2 = list(num.trees = 5))
csrf_wine
# random forest
dat.cleanwine <- sparse.model.matrix(~., data = cleanwine_price_10perc %>% select(-title, -winery))[,-1]
price <- cleanwine_price_10perc$price
rf_wine <- randomForest(as.matrix(dat.cleanwine), price, do.trace = F,
importance = T)
rf_wine
#oob_wine <- mean(price - rf_wine$votes[,2])^2)
---
title: "707Wine_Benji_YLtry"
output: html_notebook
---
```{r}
library(readr)
library(dplyr)
library(tidyr)
library(ggplot2)
library(glmnet)
library(gbm)
library(gam)
library(stringr)
library(xgboost)
library(caret)
library(Matrix)
library(e1071)
library(liquidSVM)
```

```{r}
# read data
read_csv("Data/wine-reviews/winemag-data-130k-v2.csv") %>% select(-X1) %>% unique -> data
```

```{r}
# preprocessing
scale_taster <- function(points){
    # takes a vector of numbers, subtracts every element by the mean of the vector, and then
    # divides every element by the standard deviation of the vector
    
    return((points - mean(points, na.rm = TRUE)) / sd(points, na.rm = TRUE))
}

percentile_taster <- function(x){
    # takes a vector of numbers, ranks every element and divides by n, giving the percentile of each element
    trunc(rank(x))/length(x) * 100
}

data <- data %>% group_by(taster_name) %>% mutate("Scaled_Points" = scale_taster(points))

data <- data %>% group_by(taster_name) %>% mutate("Percentile_Points" = percentile_taster(points))

tab <- data %>% group_by(province) %>% summarize("Proportion" = n()/nrow(data))
tab <- tab[tab$Proportion > 0.01, ]

tabcountry <-  data %>% group_by(country) %>% summarize("Proportion" = n()/nrow(data))
tabcountry <- tabcountry[tabcountry$Proportion > 0.01, ]

data$country_other <- ifelse(data$country %in% tabcountry$country, 
                                paste0(data$country, "_other"), data$country)
data$location <- ifelse(data$province %in% tab$province, data$province,
                                     data$country_other)

year <- str_extract_all(data$title, "[1-2][09][0-9]{2}")

data$year <- lapply(year, function(x){
    x = x %>% as.numeric
  if(!all(is.na(x))){
    newx <- x[(x > 1900) & (x < 2018)]
    if (!all(is.na(newx))) {
      newx <- max(newx)
      return(newx)
    } else {
      return(NA)
    }
  } else {
    return(NA)
  }}) %>% unlist

data$location <- factor(data$location)
data$taster_name <- factor(data$taster_name)
data$taster_name <- addNA(data$taster_name)
data$title <- factor(data$title)
data$variety <- factor(data$variety)
data$region_1 <- factor(data$region_1)
data$region_2 <- factor(data$region_2)
data$country <- factor(data$country)
data$province <- factor(data$province)
data$winery <- factor(data$winery)
data$taster_twitter_handle <- factor(data$taster_twitter_handle)
data$designation <- factor(data$designation)
```

```{r}
# helper function
impute_mean <- function(x) replace(x, is.na(x), mean(x, na.rm = TRUE))
    # impute_mean replaces missing values with the average value of a group

clean <- function(df){
    # clean removes the varieties that only have missing prices, and are thus unimputable by our rule,
    # and then it imputes the remaining missing prices using the average price of that wine's variety
    
    df %>% group_by(variety) %>% summarize("Average_Price" = mean(price, na.rm = T), 
                                           "Count" = n()) %>% 
    filter(is.na(Average_Price)) %>% select(variety) %>% unlist() -> drop_variety 
    
    df %>% filter(!(variety %in% drop_variety)) -> sample2
    
    sample2 %>% group_by(variety) %>% mutate(price = impute_mean(price)) -> sample2
    
    sample2 <- sample2[complete.cases(sample2),]
    return(sample2)
}
```

```{r}
# check data
dim(data)
summary(data)
str(data)

data <- data %>% select(-country_other, -taster_twitter_handle, -description, -winery, -designation)
```

```{r}
# split train test
set.seed(2018)
train.index <- sample(2/3 * nrow(data))
train <- data[train.index,]
test <- data[-train.index,]
dim(train)
summary(train)
dim(test)
summary(test)
```


```{r}
# clean train
clean_train <- train[lapply(train, function(x) sum(is.na(x)) / length(x))  < 0.1]
clean_train <- clean(clean_train)
clean_train$`US_vs_non-US` <- factor(ifelse(clean_train$country == 'US', 'US', 'non-US'))
clean_train[is.na(clean_train$country), 'US_vs_non-US'] <- NA
clean_train$`US_vs_non-US` <- addNA(clean_train$`US_vs_non-US`)
#clean_train <- clean_train %>% select(-title, -country)
clean_train <- clean_train %>% select(-Scaled_Points, -Percentile_Points)
dim(clean_train)
```


```{r}
ggplot(aes(x=points, y=price, col = taster_name), data = clean_train) + geom_jitter()
ggplot(aes(x=points, y=price, col = `US_vs_non-US`), data = clean_train) + geom_jitter() + theme(legend.position = "right")
ggplot(aes(x=points, y=price, col = variety), data = clean_train) + geom_jitter() + theme(legend.position = 'none')
ggplot(aes(x=year, y=price, col=`US_vs_non-US`), data = clean_train) + geom_jitter()
ggplot(aes(x=year, y=points, col=`US_vs_non-US`), data = clean_train) + geom_jitter()
```
```{r}
# gam only: points and price
set.seed(2018)
k <- 10
sp <- split(c(1:nrow(train)), c(1:k))
price_pt_gam_error <- matrix(NA, nrow=k, ncol=2)
for(i in 1:k){
    cleanwine_train <- train[-sp[[i]], ]
    cleanwine_test <- train[sp[[i]], ]
    
    # data cleaning
    cleanwine_train <- cleanwine_train[lapply(cleanwine_train, function(x) sum(is.na(x)) / length(x))  < 0.1]
    cleanwine_train <- clean(cleanwine_train)
    cleanwine_train <- cleanwine_train %>% select(-title, -country)
    #print(colnames(cleanwine_train))
    #print(head(cleanwine_train))
    
    cleanwine_test <- cleanwine_test[lapply(cleanwine_test, function(x) sum(is.na(x)) / length(x))  < 0.1]
    cleanwine_test <- clean(cleanwine_test)
    cleanwine_test <- cleanwine_test %>% select(-title, -country, -province)
    #print(colnames(cleanwine_test))
    #print(head(cleanwine_test))
    
    # select only Percentile_Points and Price for gam
    #cleanwine_train_gam <- cleanwine_train %>% select(Percentile_Points, price)
    #cleanwine_test_gam <- cleanwine_test %>% select(Percentile_Points, price)   
    
    # gam
    price_gam <- gam(price ~ s(Percentile_Points), data = cleanwine_train)
    pt_gam <- gam(Percentile_Points ~ s(price), data = cleanwine_train)
    price_pred <- predict(price_gam, cleanwine_test)
    pt_pred <- predict(pt_gam, cleanwine_test)
    price_pt_gam_error[i,1] <- mean(abs(price_pred- cleanwine_test$price))
    price_pt_gam_error[i,2] <- mean((pt_pred - cleanwine_test$Percentile_Points)^2)
}
```

```{r}
price_pt_gam_error
```


```{r}
# CV on train
set.seed(2018)
k <- 10
sp <- createFolds(train$variety, k)
price_fold_error <- matrix(NA, nrow=k, ncol=5)
pt_fold_error <- matrix(NA, nrow=k, ncol=5)
for(i in 1:k){
    cleanwine_train <- train[-sp[[k]], ]
    cleanwine_test <- train[sp[[k]], ]
    
    # data cleaning
    cleanwine_train <- cleanwine_train[lapply(cleanwine_train, function(x) sum(is.na(x)) / length(x))  < 0.1]
    cleanwine_train <- clean(cleanwine_train)
    cleanwine_train <- cleanwine_train %>% select(-title, -country, -province)
    #print(colnames(cleanwine_train))
    
    cleanwine_test <- cleanwine_test[lapply(cleanwine_test, function(x) sum(is.na(x)) / length(x))  < 0.1]
    cleanwine_test <- clean(cleanwine_test)
    cleanwine_test <- cleanwine_test %>% select(-title, -country)
    #print(colnames(cleanwine_test))
    
    cleanwine_train <- cleanwine_train %>% select(-Scaled_Points, -Percentile_Points)
    train.data.price <- cleanwine_train %>% select(-price)
    #print(colnames(train.data.price))
    train.data.price <- sparse.model.matrix(~., train.data.price)[,-1]
    train.data.pt <- cleanwine_train %>% select(-points)
    train.data.pt <- sparse.model.matrix(~., train.data.pt)[,-1]
    
    cleanwine_test <- cleanwine_test %>% select(-Scaled_Points, -Percentile_Points)
    test.data.price <- cleanwine_test %>% select(-price)
    #print(colnames(test.data.price))
    test.data.price <- sparse.model.matrix(~., test.data.price)[,-1]
    #print(colnames(test.data.price))
    test.data.pt <- cleanwine_test %>% select(-points)
    test.data.pt <- sparse.model.matrix(~., test.data.pt)[,-1]
    
    # random forest
    subsamps <- seq(0.1, 1, 0.1)
    train_error_price <- vector("numeric", length(seq(0.1, 1, 0.1)))
    train_error_pt <- vector("numeric", length(seq(0.1, 1, 0.1)))
    
    for (j in 1:length(seq(0.1, 1, 0.1))) {
      rf_train_price <- xgboost(data = train.data.price, label=cleanwine_train$price, max_depth = 0, verbose = 0, num_parallel_tree = 1000, subsample = subsamps[i], nrounds = 1, colsample_bylevel=0.6, objective = "reg:linear", eval_metric = 'mae')
      rf_train_pt <- xgboost(data = train.data.pt, label=cleanwine_train$points, max_depth = 0, verbose = 0, num_parallel_tree = 1000, subsample = subsamps[i], nrounds = 1, colsample_bylevel=0.6, objective = "reg:linear", eval_metric = 'rmse')
      train_error_price[j] <- as.numeric(rf_train_price$evaluation_log[,2])
      train_error_pt[j] <- as.numeric(rf_train_pt$evaluation_log[,2])
    }
    index.min.price <- which.min(train_error_price)
    index.min.pt <- which.min(train_error_pt)
    
    rf_cleanwine_price <- xgboost(data = train.data.price, label=cleanwine_train$price, max_depth = 0, verbose = 0, num_parallel_tree = 1000, subsample = subsamps[index.min.price], nrounds = 1, colsample_bylevel=0.6, objective = "reg:linear", eval_metric = 'mae')
    #print(rf_cleanwine_price$feature_names)
    rf_cleanwine_pt <- xgboost(data = train.data.pt, label=cleanwine_train$points, max_depth = 0, verbose = 0, num_parallel_tree = 1000, subsample = subsamps[index.min.pt], nrounds = 1, colsample_bylevel=0.6, objective = "reg:linear", eval_metric = 'rmse')
    print(length(rf_cleanwine_pt$feature_names) == length(colnames(test.data.price)))

    test.pred.price <- predict(rf_cleanwine_price, test.data.price)
    price_fold_error[i, 1] <- mean(abs(cleanwine_test$price - test.pred.price))
    test.pred.pt <- predict(rf_cleanwine_pt, test.data.pt)                               
    pt_fold_error[i,1] <- mean((cleanwine_test$price - test.pred.pt)^2)
    
    # svm
    svm_cleanwine_price <- svm(price~., cleanwine_train)
    svm_cleanwine_pt <- svm(points~., cleanwine_train)
    result_price <- test(svm_cleanwine_price, cleanwine_test)
    price_fold_error[i,2] <- mean(abs(cleanwine_test$price - result_price))

    result_pt <- test(svm_cleanwine_pt, cleanwine_test)
    pt_fold_error[i, 2] <- mean((cleanwine_test$points - result_pt)^2)
    
    # gbm
    gbmFit_price <- gbm(formula = price ~ ., data = cleanwine_train, 
                  n.trees = 1000, shrinkage = 0.05, interaction.depth = 2, cv.folds = 10, 
                  distribution = "laplace", verbose = FALSE)

    best_iter_price <- gbm.perf(gbmFit_price, method = "cv", plot.it = F)
    gbm_price_pred <- predict(gbmFit_price, newdata = cleanwine_test, n.trees = best_iter_price)
    price_fold_error[i,3] <- mean(abs(cleanwine_test$price - gbm_price_pred))
    
    gbmFit_pt <- gbm(formula = points ~ ., data = cleanwine_train, 
                  n.trees = 1000, shrinkage = 0.05, interaction.depth = 2, cv.folds = 10, 
                  distribution = "laplace", verbose = FALSE) 

    best_iter_pt <- gbm.perf(gbmFit_pt, method = "cv", plot.it = F)
    gbm_pt_pred <- predict(gbmFit_pt, newdata = cleanwine_test, n.trees = best_iter_pt)
    pt_fold_error[i,3] <- mean((cleanwine_test$points - gbm_pt_pred)^2)
    
    # glmnet
    # trn.mtx <- model.matrix(~.,cleanwine_train)
    # trn.smtx <- Matrix(trn.mtx,sparse=T)[,-1]
    # 
    # tst.mtx <- model.matrix(~.,cleanwine_test)
    # tst.smtx <- Matrix(tst.mtx,sparse=T)[,-1]
    
    fit.lasso.price <- cv.glmnet(x=train.data.price,cleanwine_train$price,alpha = 1,type.measure = "mae") 
    l.err.price <- predict(fit.lasso.price,newx = test.data.price,type = 'response')
    price_fold_error[i,4] <- mean(abs(cleanwine_test$price - l.err.price))
 
    fit.ridge.price <- cv.glmnet(x=train.data.price,cleanwine_train$price,alpha=0,type.measure = "mae")
    r.err.price <- predict(fit.ridge.price,newx = test.data.price, type = 'response')
    price_fold_error[i,5] <- mean(abs(cleanwine_test$price - r.err.price))
    
    fit.lasso.pt <- cv.glmnet(x=train.data.pt,cleanwine_train$points,alpha = 1,type.measure = "mse") 
    l.err.pt <- predict(fit.lasso.pt,newx = test.data.pt,type = 'response')
    pt_fold_error[i,4] <- mean((cleanwine_test$points - l.err.pt)^2)
 
    fit.ridge.pt <- cv.glmnet(x=train.data.pt,cleanwine_train$points,alpha=0,type.measure = "mse")
    r.err.pt <- predict(fit.ridge.pt,newx = test.data.pt, type = 'response')
    pt_fold_error[i,5] <- mean((cleanwine_test$points - r.err.pt)^2)
}
```

```{r}
colnames(price_fold_error) <- c("random_forest", "svm", "gbm", "lasso", "ridge")
colnames(pt_fold_error) <- c("random_forest", "svm", "gbm", "lasso", "ridge")
price_fold_error
pt_fold_error
```

```{r}
# clean_train
clean_train <- train[lapply(train, function(x) sum(is.na(x)) / length(x))  < 0.1]
clean_train <- clean(clean_train)
clean_train <- clean_train %>% select(-title, -country, -province)
clean_train <- clean_train %>% select(-Scaled_Points, -Percentile_Points)
dim(clean_train)

# pick best model and predict on test
clean_test <- test[lapply(test, function(x) sum(is.na(x)) / length(x) ) < 0.1]
clean_test <- clean(clean_test)
clean_test_country <- clean_test$country
clean_test_province <- clean_test$province
clean_test <- clean_test %>% select(-title, -country, -province)
clean_test <- clean_test %>% select(-Scaled_Points, -Percentile_Points)
dim(clean_test)
# test.data.price <- test %>% select(-price)
# test.data.price <- sparse.model.matrix(~., test.data.price)[,-1]
# test.data.pt <- test %>% select(-points)
# test.data.pt <- sparse.model.matrix(~., test.data.pt)[,-1]
    
svm_test_price <- svm(price~., clean_train)
svm_test_pt <- svm(points~., clean_train)
result_test_price <- predict(svm_test_price, clean_test)
mean(abs(clean_test$price - result_test_price))

result_test_pt <- predict(svm_test_pt, clean_test)
mean((clean_test$points - result_test_pt)^2)

clean_test$country <- clean_test_country
clean_test$province <- clean_test_province
clean_test$price_pred <- result_test_price
clean_test$point_pred <- result_test_pt

```

```{r}
write_csv(clean_test, "clean_test.csv")
```

```{r}
price_pt_df <- data.frame(cbind(clean_test$points, clean_test$price, result_test_price, result_test_pt))
colnames(price_pt_df) <- c("points", "price", "pred_price", "pred_point")
ggplot(aes(x=points, y=price), data = price_pt_df) + geom_jitter() + geom_point(aes(x=points, y=pred_price), col="red")
ggplot(aes(x=price, y=points), data = price_pt_df) + geom_jitter() + geom_point(aes(x=price, y=pred_point), col="red")
```

```{r}
save(clean_train, clean_test, price_fold_error, pt_fold_error, file = "~/Dropbox/Duke/707/Project/707Wine/wine_models.RData")
```


```{r}
# test
cleanwine <- data
cleanwine$location <- factor(cleanwine$location)
cleanwine$taster_name <- factor(cleanwine$taster_name)
cleanwine$title <- factor(cleanwine$title)
cleanwine$variety <- factor(cleanwine$variety)
cleanwine$taster_name <- addNA(cleanwine$taster_name)
cleanwine$region_1 <- factor(cleanwine$region_1)
cleanwine$region_2 <- factor(cleanwine$region_2)

cleanwine <- cleanwine[lapply(cleanwine, function(x) sum(is.na(x)) / length(x) ) < 0.1]
cleanwine <- clean(cleanwine)
cleanwine <- cleanwine %>% select(-title, -country)
summary(cleanwine)

```
```{r}

tmp = data #%>% select(-Scaled_Points, -Percentile_Points)

train.idx <- sample(nrow(tmp), 4/5 * nrow(tmp))
cleanwine_train <- tmp[train.idx, ]
cleanwine_test <- tmp[-train.idx, ]

cleanwine_train$location <- factor(cleanwine_train$location)
    cleanwine_train$taster_name <- factor(cleanwine_train$taster_name)
    cleanwine_train$taster_name <- addNA(cleanwine_train$taster_name)
    cleanwine_train$title <- factor(cleanwine_train$title)
    cleanwine_train$variety <- factor(cleanwine_train$variety)
    cleanwine_train$region_1 <- factor(cleanwine_train$region_1)
    cleanwine_train$region_2 <- factor(cleanwine_train$region_2)
    
    cleanwine_test$location <- factor(cleanwine_test$location)
    cleanwine_test$taster_name <- factor(cleanwine_test$taster_name)
    cleanwine_test$taster_name <- addNA(cleanwine_test$taster_name)
    cleanwine_test$title <- factor(cleanwine_test$title)
    cleanwine_test$variety <- factor(cleanwine_test$variety)
    cleanwine_test$region_1 <- factor(cleanwine_test$region_1)
    cleanwine_test$region_2 <- factor(cleanwine_test$region_2)

    cleanwine_train <- cleanwine_train[lapply(cleanwine_train, function(x) sum(is.na(x)) / length(x) ) < 0.1]
    cleanwine_train <- clean(cleanwine_train)
    cleanwine_train <- cleanwine_train %>% select(-title, -country)
    print(colnames(cleanwine_train))
    
    cleanwine_test <- cleanwine_test[lapply(cleanwine_test, function(x) sum(is.na(x)) / length(x) ) < 0.1]
    cleanwine_test <- clean(cleanwine_test)
    cleanwine_test <- cleanwine_test %>% select(-title, -country)
    print(colnames(cleanwine_test))
    
    cleanwine_train <- cleanwine_train %>% select(-Scaled_Points, -Percentile_Points)
    train.data.price <- cleanwine_train %>% select(-price)
    print(colnames(train.data.price))
    train.data.price <- sparse.model.matrix(~., train.data.price)[,-1]
    train.data.pt <- cleanwine_train %>% select(-points)
    train.data.pt <- sparse.model.matrix(~., train.data.pt)[,-1]
    
    cleanwine_test <- cleanwine_test %>% select(-Scaled_Points, -Percentile_Points)
    test.data.price <- cleanwine_test %>% select(-price)
    print(colnames(test.data.price))
    test.data.price <- sparse.model.matrix(~., test.data.price)[,-1]
    print(colnames(test.data.price))
    test.data.pt <- cleanwine_test %>% select(-points)
    test.data.pt <- sparse.model.matrix(~., test.data.pt)[,-1]
```

```{r}
# tmp = cleanwine %>% select(-Scaled_Points, -Percentile_Points)
# 
# train.idx <- sample(nrow(tmp), 4/5 * nrow(tmp))
# tmp.train <- tmp[train.idx, ]
# tmp.test <- tmp[-train.idx, ]
# train.data.price <- tmp.train %>% select(-price)
# train.data.price <- sparse.model.matrix(~., train.data.price)[,-1]
# print(colnames(train.data.price))
# train.data.pt <- tmp.train %>% select(-points)
# train.data.pt <- sparse.model.matrix(~., train.data.pt)[,-1]
# test.data.price <- tmp.test %>% select(-price)
# test.data.price <- sparse.model.matrix(~., test.data.price)[,-1]
# print(colnames(test.data.price))
# test.data.pt <- tmp.test %>% select(-points)
# test.data.pt <- sparse.model.matrix(~., test.data.pt)[,-1]

subsamps <- seq(0.1, 1, 0.1)
colsamps <- seq(0.1, 1, 0.1)
#train_error <- matrix(NA, nrow = length(seq(0.1, 1, 0.1)), ncol = length(seq(0.1, 1, 0.1)))
train_error_price <- vector("numeric", length(seq(0.1, 1, 0.1)))
train_error_pt <- vector("numeric", length(seq(0.1, 1, 0.1)))
for (i in 1:length(seq(0.1, 1, 0.1))) {
  #for (j in 1:length(seq(0.1, 1, 0.1))) {
    rf_cleanwine_price <- xgboost(data = train.data.price, label=cleanwine_train$price, verbose = 0, max_depth = 5, num_parallel_tree = 1000, subsample = subsamps[i], nrounds = 1, colsample_bylevel=0.6, objective = "reg:linear", eval_metric = 'mae')
    rf_cleanwine_pt <- xgboost(data = train.data.pt, label=cleanwine_train$points, verbose = 0, max_depth = 5, num_parallel_tree = 1000, subsample = subsamps[i], nrounds = 1, colsample_bylevel=0.6, objective = "reg:linear", eval_metric = 'rmse')
    #train_error[i,j] <- as.numeric(rf_cleanwine$evaluation_log[,2])
    train_error_price[i] <- as.numeric(rf_cleanwine_price$evaluation_log[,2])
    train_error_pt[i] <- as.numeric(rf_cleanwine_pt$evaluation_log[,2])
  #}
}
min.index.price <- which(train_error_price == min(train_error_price), arr.ind = TRUE)
min.index.pt <- which(train_error_pt == min(train_error_pt), arr.ind = TRUE)
rf_cleanwine_price <- xgboost(data = train.data.price, label=cleanwine_train$price, max_depth = 0, verbose = 0, num_parallel_tree = 1000, subsample = subsamps[min.index.price], nrounds = 1, colsample_bylevel=0.6, objective = "reg:linear", eval_metric = 'mae')
print(rf_cleanwine_price$feature_names)
rf_cleanwine_pt <- xgboost(data = train.data.pt, label=cleanwine_train$points, max_depth = 0, verbose = 0, num_parallel_tree = 1000, subsample = subsamps[min.index.pt], nrounds = 1, colsample_bylevel=0.6, objective = "reg:linear", eval_metric = 'rmse')

test.pred.price <- predict(rf_cleanwine_price, test.data.price)
mean(abs(test.pred.price - tmp.test$price))

test.pred.pt <- predict(rf_cleanwine_pt, test.data.pt)
mean((test.pred.pt - tmp.test$points)^2)
```

```{r}
which(train_error == min(train_error), arr.ind = TRUE)
```
```{r}
#gammalist <- c(0.005,0.01,0.015,0.02,0.025,0.03,0.035,0.04,0.045,0.05)
#tune.out <- tune.svm(tmp.data, tmp$price, 
                 #kernel='linear', cost=2^(-1:5), gamma = gammalist)
#summary(tune.out)

svm_cleanwine_price <- svm(price~., tmp.train)
svm_cleanwine_price

svm_cleanwine_pt <- svm(points~., tmp.train)
svm_cleanwine_pt
```
```{r}
result_price <- test(svm_cleanwine_price, tmp.test)
mean(abs(tmp$price - result_price))

result_pt <- test(svm_cleanwine_pt, tmp.test)
mean((tmp$points - result_pt)^2)
```



```{r}
## Split in training and test data
train.idx <- sample(nrow(iris), 2/3 * nrow(iris))
iris.train <- iris[train.idx, ]
iris.test <- iris[-train.idx, ]

## Run case-specific RF
csrf(Species ~ ., training_data = iris.train, test_data = iris.test, 
     params1 = list(num.trees = 50, mtry = 4), 
     params2 = list(num.trees = 5))
```


```{r}
#idx = sample(1:900)
tmp_train = tmp[1:900, ]
tmp_test  = tmp[901:1000, ]

rf_cleanwine <- csrf(sparse.model.matrix(tmp_train %>% select(-price)), tmp_train$price, 
                     params1 = list(num.trees = 500, mtry = 4), 
                     params2 = list(num.trees = 50, mtry = 4))
```


```{r}
rf_cleanwine <- csrf(price ~ ., cleanwine %>% select(-Scaled_Points), params1 = list(num.trees = 50, mtry = 4))
```



```{r}
# gbm
gbmFit <- gbm(formula = price ~ ., data = cleanwine_price_10perc %>% select(-title, -winery), 
              n.trees = 1000, shrinkage = 0.01, interaction.depth = 2, cv.folds = 10, 
              distribution = "gaussian")

best_iter <- gbm.perf(gbmFit, method = "cv")

# Performance on whole dataset

diff_squared <- (cleanwine_price_10perc$price - 
    predict(gbmFit, newdata = cleanwine_price_10perc, n.trees = best_iter))^2

mean(diff_squared, na.rm = TRUE)

```
```{r}
# ranger
csrf_wine <- csrf(formula = price ~ ., training_data = cleanwine_price_train, test_data = cleanwine_price_test, params1 = list(num.trees = 50, mtry = 4), params2 = list(num.trees = 5))
csrf_wine
```


```{r}
# random forest
dat.cleanwine <- sparse.model.matrix(~., data = cleanwine_price_10perc %>% select(-title, -winery))[,-1]
price <- cleanwine_price_10perc$price

rf_wine <- randomForest(as.matrix(dat.cleanwine), price, do.trace = F, 
                          importance = T) 
rf_wine
#oob_wine <- mean(price - rf_wine$votes[,2])^2)
```

